## Imports all necessary modules
import networkx as nx
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from networkx import community
from networkx.algorithms.community import greedy_modularity_communities
import random
import math
import csv
from sklearn import svm
from sklearn.metrics import accuracy_score , f1_score
from sklearn.preprocessing import normalize
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
## Load the Facebook social network (undirected edge list) and print summary statistics.
G_fbsocial = nx.read_edgelist("facebook_combined.txt", create_using=nx.Graph(), nodetype=int)
# nx.info() was deprecated in NetworkX 2.6 and removed in 3.0 -- print the
# equivalent node/edge summary explicitly instead.
print(f"Graph with {G_fbsocial.number_of_nodes()} nodes and {G_fbsocial.number_of_edges()} edges")
print("Number of Actors:", nx.number_of_nodes(G_fbsocial), '\n')
print("Number of Ties:", nx.number_of_edges(G_fbsocial), '\n')
# Diameter is only defined on a connected graph; is_connected below confirms it.
print("Diameter:", nx.diameter(G_fbsocial), '\n')
print("Average Clustering:", nx.average_clustering(G_fbsocial), '\n')
print("Network Density:", nx.density(G_fbsocial), '\n')
print("Connected Graph:", nx.is_connected(G_fbsocial))
def plt_network(G):
    """Draw graph *G* with a spring layout: small red nodes, no labels, no axes."""
    layout = nx.spring_layout(G)
    plt.style.use('fivethirtyeight')
    plt.rcParams['figure.figsize'] = (30, 25)
    plt.axis('off')
    nx.draw_networkx(G, layout, with_labels=False, node_size=30, node_color='r')
    plt.show()


plt_network(G_fbsocial)
Let's create a subgraph containing only the nodes whose degree is greater than 50.
## Build the high-degree core: keep only nodes with degree > 50, drop isolates.
# Iterate the degree view directly instead of materializing a temp dict first.
G_temp = nx.subgraph(G_fbsocial, [n for n, d in G_fbsocial.degree() if d > 50])
G_subgraph_fb = nx.Graph(G_temp)  # mutable copy -- subgraph views are frozen
G_subgraph_fb.remove_nodes_from(list(nx.isolates(G_subgraph_fb)))
# nx.info() was removed in NetworkX 3.0 -- print the summary explicitly.
print(f"Graph with {G_subgraph_fb.number_of_nodes()} nodes and {G_subgraph_fb.number_of_edges()} edges")
plt_network(G_subgraph_fb)
###Define function to plot the centrality
def plt_network_centrality(G, central_measure, central_list):
    """Draw *G* with nodes sized by the scores in *central_measure* (a
    node -> score dict) and coloured by degree; only the nodes listed in
    *central_list* (the top-ranked ones) get red labels."""
    labels = {node: node for node in G.nodes() if node in central_list}
    layout = nx.spring_layout(G)
    node_color = [20000.0 * G.degree(v) for v in G]
    node_size = [10000 * score for score in central_measure.values()]
    plt.style.use('fivethirtyeight')
    plt.rcParams['figure.figsize'] = (30, 25)
    nx.draw_networkx(G, pos=layout, with_labels=False,
                     node_color=node_color,
                     node_size=node_size)
    nx.draw_networkx_labels(G, layout, labels, font_size=35, font_color='r')
    plt.axis('off')
The degree of an actor is defined as the number of connecting ties that actor has with others. To calculate the degree centrality of an actor, its degree is divided by the number of other actors in the network (n-1).
Degree centrality defines the importance of an actor in a network as measured by its degree, i.e. the higher the degree of an actor, the more important it is in the network.
## Degree centrality on the full network: rank, report and plot the top 5 actors.
degree_central = nx.degree_centrality(G_fbsocial)
top_degree_central = sorted(degree_central, key=degree_central.get, reverse=True)[:5]
print("Top 5 degree centrality nodes:", top_degree_central)
plt_network_centrality(G_fbsocial, degree_central, top_degree_central)
# The original had five bare `G_fbsocial.degree(...)` expressions -- notebook
# residue that is silently discarded in a script. Print the raw degrees of the
# top-ranked actors (plus ego node 0) explicitly, in one loop.
for actor in (107, 1684, 1912, 3437, 0):
    print(f"Degree of node {actor}:", G_fbsocial.degree(actor))
## Degree centrality restricted to the high-degree subgraph.
degree_central_sub = nx.degree_centrality(G_subgraph_fb)
# Rank (node, score) pairs by score descending and keep the five best nodes.
top_degree_central_sub = [node for node, _ in
                          sorted(degree_central_sub.items(), key=lambda kv: kv[1], reverse=True)[:5]]
print("Top 5 degree centrality nodes:", top_degree_central_sub)
plt_network_centrality(G_subgraph_fb, degree_central_sub, top_degree_central_sub)
Closeness centrality defines the importance of an actor in a network as measured by how close it is to all other actors. For an actor, it is defined as the average geodesic distance between that actor and all other actors in the network.
## Closeness centrality on the full network.
close_central = nx.closeness_centrality(G_fbsocial)
top_close_central = sorted(close_central, key=close_central.get, reverse=True)[:5]
print("Top 5 closeness centrality nodes:", top_close_central)
closeness_centrality = sorted(close_central.items(), key=lambda i: (i[1], i[0]), reverse=True)[:5]
# A bare expression is a no-op in a script -- print the (node, score) pairs.
print(closeness_centrality)
plt_network_centrality(G_fbsocial, close_central, top_close_central)
# The same three lines were copy-pasted five times with only the source node
# changing -- loop over the sources instead. A smaller geodesic-distance sum
# corresponds to a higher closeness centrality.
for source in (107, 58, 428, 563, 1684):
    short_path = nx.single_source_shortest_path_length(G_fbsocial, source=source)
    print(f"Sum of geodesic distance from '{source}' is:", sum(short_path.values()))
## Closeness centrality on the high-degree subgraph.
close_central_sub = nx.closeness_centrality(G_subgraph_fb)
top_close_central_sub = sorted(close_central_sub, key=close_central_sub.get, reverse=True)[:5]
print("Top 5 closeness centrality nodes:", top_close_central_sub)
closeness_centrality_sub = sorted(close_central_sub.items(), key=lambda i: (i[1], i[0]), reverse=True)[:5]
# A bare expression is a no-op in a script -- print the (node, score) pairs.
print(closeness_centrality_sub)
plt_network_centrality(G_subgraph_fb, close_central_sub, top_close_central_sub)
Betweenness centrality defines and measures the importance of an actor in a network by how many times it occurs on the shortest paths between all pairs of actors. The actor with the highest betweenness centrality acts like a bridge between two social groups.
## Betweenness centrality on the full network (endpoints counted in the paths).
between_central = nx.betweenness_centrality(G_fbsocial, normalized=True, endpoints=True)
top_between_central = sorted(between_central, key=between_central.get, reverse=True)[:5]
# "betweeness" typo in the user-facing message fixed.
print("Top 5 betweenness centrality nodes:", top_between_central)
betweenness_centrality = sorted(between_central.items(), key=lambda i: (i[1], i[0]), reverse=True)[:5]
# A bare expression is a no-op in a script -- print the (node, score) pairs.
print(betweenness_centrality)
plt_network_centrality(G_fbsocial, between_central, top_between_central)
## Betweenness centrality on the high-degree subgraph.
between_central_sub = nx.betweenness_centrality(G_subgraph_fb, normalized=True, endpoints=True)
top_between_central_sub = sorted(between_central_sub, key=between_central_sub.get, reverse=True)[:5]
# "betweeness" typo in the user-facing message fixed.
print("Top 5 betweenness centrality nodes:", top_between_central_sub)
betweenness_centrality_sub = sorted(between_central_sub.items(), key=lambda i: (i[1], i[0]), reverse=True)[:5]
# A bare expression is a no-op in a script -- print the (node, score) pairs.
print(betweenness_centrality_sub)
plt_network_centrality(G_subgraph_fb, between_central_sub, top_between_central_sub)
Eigenvector centrality is a measure of the influence of an actor in a network. It assigns relative scores to all actors based on the concept that connections to high-scoring actors contribute more to an actor's score than equal connections to low-scoring actors.
## Eigenvector centrality on the full network.
eigen_central = nx.eigenvector_centrality(G_fbsocial)
top_eigen_central = sorted(eigen_central, key=eigen_central.get, reverse=True)[:5]
print("Top 5 eigenvector centrality nodes:", top_eigen_central)
eigen_centrality = sorted(eigen_central.items(), key=lambda i: (i[1], i[0]), reverse=True)[:5]
# A bare expression is a no-op in a script -- print the (node, score) pairs.
print(eigen_centrality)
plt_network_centrality(G_fbsocial, eigen_central, top_eigen_central)
## Eigenvector centrality on the high-degree subgraph.
eigen_central_sub = nx.eigenvector_centrality(G_subgraph_fb)
top_eigen_central_sub = sorted(eigen_central_sub, key=eigen_central_sub.get, reverse=True)[:5]
print("Top 5 eigenvector centrality nodes:", top_eigen_central_sub)
eigen_centrality_sub = sorted(eigen_central_sub.items(), key=lambda i: (i[1], i[0]), reverse=True)[:5]
# A bare expression is a no-op in a script -- print the (node, score) pairs.
print(eigen_centrality_sub)
plt_network_centrality(G_subgraph_fb, eigen_central_sub, top_eigen_central_sub)
def create_subgraph(G, user):
    """Return the subgraph of *G* induced by *user*'s first- and second-degree
    neighbourhood, including *user* itself (plot_subgraph highlights it).

    The original list-based version called .remove(user) once, but in an
    undirected graph *user* appears in the second-degree list once per
    neighbour, so a single remove() left it in the node set anyway (except in
    the degree-1 edge case). Sets make the intended membership explicit.
    """
    first_degree = set(G.neighbors(user))
    second_degree = set()
    for neighbor in first_degree:
        second_degree.update(G.neighbors(neighbor))
    # Keep the ego node explicitly so downstream plotting can highlight it.
    nodes = first_degree | second_degree | {user}
    return nx.subgraph(G, nodes)
def plot_subgraph(G, node):
    """Draw *G*, highlighting *node* as one large yellow vertex among small red ones."""
    layout = nx.spring_layout(G)
    colors = []
    sizes = []
    for v in G:
        if v == node:
            colors.append('yellow')
            sizes.append(1000)
        else:
            colors.append('red')
            sizes.append(35)
    plt.style.use('fivethirtyeight')
    plt.rcParams['figure.figsize'] = (30, 25)
    plt.axis('off')
    nx.draw_networkx(G, layout, with_labels=False, node_color=colors, node_size=sizes)
    plt.show()
def edge_to_remove(G):
    """Return the edge of *G* with the highest edge-betweenness centrality.

    The original sorted the entire centrality dict (O(E log E)) only to read
    the first entry; max() finds it in a single O(E) pass. default=() keeps
    the original empty-graph return value.
    """
    G_edge_dict = nx.edge_betweenness_centrality(G)
    return max(G_edge_dict, key=G_edge_dict.get, default=())
def girvan_newman(G, comm_num):
    """Split *G* (modified in place) into at least *comm_num* connected
    components by repeatedly deleting the highest-betweenness edge.

    Returns (connected-components iterator, pruned graph). Bails out once
    fewer than 100 edges remain, mirroring the original safeguard.
    """
    connect = nx.connected_components(G)
    connect_count = nx.number_connected_components(G)
    while connect_count <= (comm_num - 1):
        # Find the most-central edge ONCE per iteration: the original called
        # edge_to_remove(G) twice, doubling the expensive betweenness pass.
        u, v = edge_to_remove(G)
        G.remove_edge(u, v)
        connect = nx.connected_components(G)
        connect_count = nx.number_connected_components(G)
        if nx.number_of_edges(G) < 100:
            connect_count = comm_num
    return connect, G
def community_node(G, comm_num = 2):
    # find communities in the graph
    # Runs Girvan-Newman on a copy of G until at least comm_num components
    # exist, then scores each community with the modularity formula
    # Q = (1/2m) * sum_ij [A_ij - k_i*k_j/(2m)] over in-community pairs.
    # Returns (edge-pruned graph, per-community score list, node groups).
    modularity = []
    node_groups = []
    communities ,graph = girvan_newman(G.copy(), comm_num)
    original_graph = G.copy()
    m = original_graph.number_of_edges()  # edge count of the UNCUT graph
    q = 0
    for group in communities:
        node_groups.append(list(group))
        # NOTE(review): q is never reset between groups, and the running sum
        # is divided by 2m once per group, so each appended value folds in the
        # already-normalized total of all previous groups. This looks
        # unintentional -- confirm against the intended modularity definition.
        for nodei in group:
            for nodej in group:
                # Degrees k_i, k_j come from the original graph; adjacency
                # A_ij is checked on the pruned graph.
                ki = original_graph.degree(nodei)
                kj = original_graph.degree(nodej)
                aij = 0
                if graph.has_edge(nodei, nodej) is True:
                    aij = 1
                else:
                    aij = 0
                q = aij - (ki*kj)/(m*2) +q
        q = q/(2*m)
        modularity.append(q)
    return graph , modularity, node_groups
def plot_community_subgraph(G, node_groups):
    """Draw *G* once per community, colouring each community's nodes differently.

    The palette holds 15 colours but the original indexed with a hard-coded
    % 10, silently ignoring the last five entries; index modulo the actual
    palette length so every colour is usable.
    """
    colors = ["green", "red", "orange", "blue", "violet", "cyan", "yellow", "indigo", "pink", "black", "aliceblue", "aqua", "aquamarine", "darkturquoise", "royalblue"]
    plt.style.use('fivethirtyeight')
    plt.rcParams['figure.figsize'] = (30, 25)
    pos = nx.spring_layout(G)
    for i, group in enumerate(node_groups):
        node_list = list(group)
        nx.draw(G, pos, with_labels=False, nodelist=node_list,
                node_color=colors[i % len(colors)], alpha=0.8)
    plt.show()
## Girvan-Newman communities on the full network.
graph_fb, modularity_fb, community_node_groups_fb = community_node(G_fbsocial, 2)
# A bare expression is a no-op in a script -- print the per-community scores.
print("Per-community modularity:", modularity_fb)
print("Modularity:", nx.algorithms.community.modularity(G_fbsocial, community_node_groups_fb))
print(community_node_groups_fb)
plot_community_subgraph(G_fbsocial, community_node_groups_fb)
## Clauset-Newman-Moore greedy modularity maximization on the full network.
greed_community_fb = greedy_modularity_communities(G_fbsocial)
greed_list_fb = list(greed_community_fb)
# greed_list_fb is already a list -- no need to re-materialize it for len().
print("number of communities is", len(greed_list_fb))
print(greed_list_fb)
print("Modularity:", nx.algorithms.community.modularity(G_fbsocial, greed_list_fb))
plot_community_subgraph(G_fbsocial, greed_list_fb)
## Communities in the high-degree subgraph: Girvan-Newman, then greedy modularity.
graph_subfb, modularity_subfb, community_node_groups_subfb = community_node(G_subgraph_fb, 5)
# A bare expression is a no-op in a script -- print the per-community scores.
print("Per-community modularity:", modularity_subfb)
print("Modularity:", nx.algorithms.community.modularity(G_subgraph_fb, community_node_groups_subfb))
plot_community_subgraph(G_subgraph_fb, community_node_groups_subfb)
greed_community_subfb = greedy_modularity_communities(G_subgraph_fb)
greed_list_subfb = list(greed_community_subfb)
# greed_list_subfb is already a list -- no need to re-materialize it for len().
print("number of communities is", len(greed_list_subfb))
print(greed_list_subfb)
print("Modularity:", nx.algorithms.community.modularity(G_subgraph_fb, greed_list_subfb))
plot_community_subgraph(G_subgraph_fb, greed_list_subfb)
## Ego-style analysis around node 1912: 2-hop subgraph, Girvan-Newman, greedy modularity.
subgraph_1912 = create_subgraph(G_fbsocial, 1912)
# nx.info() was removed in NetworkX 3.0 -- print the summary explicitly.
print(f"Graph with {subgraph_1912.number_of_nodes()} nodes and {subgraph_1912.number_of_edges()} edges")
plot_subgraph(subgraph_1912, 1912)
graph_1912, modularity_1912, community_node_groups_1912 = community_node(subgraph_1912, 5)
# A bare expression is a no-op in a script -- print the per-community scores.
print("Per-community modularity:", modularity_1912)
print("Modularity:", nx.algorithms.community.modularity(subgraph_1912, community_node_groups_1912))
plt_network(graph_1912)
plot_community_subgraph(subgraph_1912, community_node_groups_1912)
greed_community_1912 = greedy_modularity_communities(subgraph_1912)
greed_list_1912 = list(greed_community_1912)
# greed_list_1912 is already a list -- no need to re-materialize it for len().
print("number of communities is", len(greed_list_1912))
print(greed_list_1912)
print("Modularity:", nx.algorithms.community.modularity(subgraph_1912, greed_list_1912))
plot_community_subgraph(subgraph_1912, greed_list_1912)
## Ego-style analysis around node 107: 2-hop subgraph, Girvan-Newman, greedy modularity.
subgraph_107 = create_subgraph(G_fbsocial, 107)
# nx.info() was removed in NetworkX 3.0 -- print the summary explicitly.
print(f"Graph with {subgraph_107.number_of_nodes()} nodes and {subgraph_107.number_of_edges()} edges")
plot_subgraph(subgraph_107, 107)
graph_107, modularity_107, community_node_groups_107 = community_node(subgraph_107, 2)
# A bare expression is a no-op in a script -- print the per-community scores.
print("Per-community modularity:", modularity_107)
print("Modularity:", nx.algorithms.community.modularity(subgraph_107, community_node_groups_107))
print(community_node_groups_107)
plt_network(graph_107)
plot_community_subgraph(subgraph_107, community_node_groups_107)
greed_community_107 = greedy_modularity_communities(subgraph_107)
greed_list_107 = list(greed_community_107)
# greed_list_107 is already a list -- no need to re-materialize it for len().
print("number of communities is", len(greed_list_107))
print(greed_list_107)
print("Modularity:", nx.algorithms.community.modularity(subgraph_107, greed_list_107))
plot_community_subgraph(subgraph_107, greed_list_107)
####Let's define the neighbourhood
def CommonNeighbors(u, v, g):
    """Return the number of neighbours that nodes *u* and *v* share in graph *g*."""
    shared = set(g.neighbors(u)) & set(g.neighbors(v))
    return len(shared)
def common_neighbors(g, edges):
    """For each (u, v) pair in *edges*, count the neighbours u and v share.

    Returns a list of (u, v, count) triples; pairs whose nodes are missing
    from the graph are skipped (best-effort, as in the original).

    Bug fixed: Graph.neighbors() returns a one-shot iterator in NetworkX >= 2,
    so the original's repeated `neighbor in neighbors_two` test exhausted the
    iterator after the first probe and systematically undercounted. Both
    neighbour collections are now materialized as sets first.
    """
    list_neighbours = []
    for edge in edges:
        node_one, node_two = edge[0], edge[1]
        try:
            neighbors_one = set(g.neighbors(node_one))
            neighbors_two = set(g.neighbors(node_two))
            num_common_neighbors = len(neighbors_one & neighbors_two)
            list_neighbours.append((node_one, node_two, num_common_neighbors))
        except Exception:
            # Node not present in g -- skip this pair, matching the original.
            pass
    return list_neighbours
####Let's define the feature set
# Similarity metrics used as link-prediction features: the custom
# common-neighbour counter above plus four NetworkX pairwise scores.
# Each callable takes (graph, edge list) and yields (u, v, score) triples.
feature_set = [common_neighbors,
               nx.resource_allocation_index,
               nx.jaccard_coefficient,
               nx.adamic_adar_index,
               nx.preferential_attachment
               ]
#### Let's create the fakeedges for negative sample
def produce_fake_edge(g, neg_g, num_test_edges):
    """Add *num_test_edges* fake (non-existing) edges of *g* to *neg_g*.

    Pairs of nodes are sampled at random and kept only if their shortest-path
    distance is >= 2, i.e. they are not already directly linked.
    """
    # random.sample requires a real sequence -- passing the set-like node view
    # was deprecated in Python 3.9 and rejected since 3.11. Materialize once,
    # outside the loop.
    nodes = list(g.nodes())
    i = 0
    while i < num_test_edges:
        u, v = random.sample(nodes, 2)
        try:
            if nx.shortest_path_length(g, source=u, target=v) >= 2:
                neg_g.add_edge(u, v, positive="False")
                i += 1
        except Exception:
            # No path between u and v (different components) -- try another pair.
            pass
def sample_extraction(g, pos_num, neg_num):
    """Extract link-prediction samples from *g* (which is modified in place).

    Removes *pos_num* randomly chosen real edges from g (the positive test
    set) and generates *neg_num* fake edges as negatives.
    Returns (positive edge list, negative edge view).
    """
    # randomly select pos_num edges as positive test samples
    print("----------------extract positive samples--------------------")
    # random.sample needs a sequence; edge views are rejected since Python 3.11.
    pos_sample = random.sample(list(g.edges()), pos_num)
    # adding non-existing edges
    print("----------------extract negative samples--------------------")
    neg_g = nx.Graph()
    produce_fake_edge(g, neg_g, neg_num)
    neg_sample = neg_g.edges()
    # remove the positive sample edges; what remains in g is the training graph
    g.remove_edges_from(pos_sample)
    return pos_sample, neg_sample
def feature_extraction(g, pos_sample, neg_sample, feature_name, model="single", combine_num=2):
    """Score every sampled pair with each feature function in *feature_name*,
    attach labels (1 = real edge, 0 = fake), write the table to
    features_<model>_<combine_num>.csv and return it."""
    data = []
    label = ["label"] + ["1"] * len(pos_sample) + ["0"] * len(neg_sample)
    for feat in feature_name:
        print("-----extract feature:", feat.__name__, "----------")
        # One column per feature: header, then positive scores, then negative.
        column = [feat.__name__]
        column.extend(score for _, _, score in feat(g, pos_sample))
        column.extend(score for _, _, score in feat(g, neg_sample))
        data.append(column)
    data.append(label)
    data = transpose(data)
    print("----------write the features to file---------------")
    write_data_to_file(data, "features_" + model + "_" + str(combine_num) + ".csv")
    return data
def write_data_to_file(data, filename):
    """Write *data* (an iterable of rows) to *filename* as CSV.

    Fixes: the file handle is now closed even if writing raises (context
    manager), and newline="" is passed as the csv module requires -- without
    it Windows produces blank lines between rows.
    """
    with open(filename, "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerows(data)
def transpose(data):
    """Return *data* with rows and columns swapped (each column becomes a row)."""
    return list(map(list, zip(*data)))
def link_predict(filename="facebook_combined.txt", pos_num=0.5, neg_num=0.5, model="combined", combine_num=1,
                 feature_name=common_neighbors):
    """End-to-end feature build for link prediction.

    Loads the edge list, converts the *pos_num*/*neg_num* fractions to edge
    counts, extracts positive/negative samples and writes the feature CSV.
    Returns the feature table (the original computed it and silently dropped
    it; returning it is backward-compatible for callers that ignore it).
    """
    g = nx.read_edgelist(filename, create_using=nx.Graph(), nodetype=int)
    num_edges = g.number_of_edges()
    pos_num = int(num_edges * pos_num)
    neg_num = int(num_edges * neg_num)
    pos_sample, neg_sample = sample_extraction(g, pos_num, neg_num)
    train_data = feature_extraction(g, pos_sample, neg_sample, feature_name, model, combine_num)
    return train_data
## Build the combined feature CSV, then load it and prepare the train/test split.
link_predict(filename="facebook_combined.txt", model="combined", combine_num=2, feature_name=feature_set)
data_array = np.loadtxt(open("features_combined_2.csv", "rb"), delimiter=",", skiprows=1)
n_rows, n_cols = data_array.shape
np.random.shuffle(data_array)
# 80/20 split; the last column is the label, everything before it the features.
train_l = int(0.8 * n_rows)
X_train = data_array[:train_l, :n_cols - 1]
Y_train = data_array[:train_l, n_cols - 1]
X_test = data_array[train_l:, :n_cols - 1]
Y_test = data_array[train_l:, n_cols - 1]
# Max-normalize each feature column, then standardize with statistics
# fitted on the training split only.
X_train = normalize(X_train, axis=0, norm='max')
X_test = normalize(X_test, axis=0, norm='max')
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
def svm_model(training, training_labels, testing, testing_labels):
    """Fit a Support Vector Machine on the training split and print the
    held-out accuracy and macro-averaged F1 score."""
    classifier = svm.SVC()
    classifier.fit(training, training_labels)
    predictions = classifier.predict(testing)
    print("SVM accuracy:", accuracy_score(testing_labels, predictions))
    print("SVM F1 Score:", f1_score(predictions, testing_labels, average='macro'))


svm_model(X_train, Y_train, X_test, Y_test)
def logistic_model(training, training_labels, testing, testing_labels):
    """Fit a logistic-regression classifier and print held-out accuracy and
    macro-averaged F1 score.

    Bug fixed: the original fitted the model twice -- once via .fit() chained
    onto the constructor and once again explicitly. One fit is sufficient.
    """
    clf = LogisticRegression(random_state=0, solver='lbfgs', multi_class='ovr')
    clf.fit(training, training_labels)
    result = clf.predict(testing)
    print("Logistic regression accuracy:", accuracy_score(testing_labels, result))
    print("Logistic regression F1 Score:", f1_score(result, testing_labels, average='macro'))


logistic_model(X_train, Y_train, X_test, Y_test)